import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load the MNIST splits: each data row is a flattened 28x28 image,
# each label file holds one digit per row (no header in any file).
def _read_mnist_csv(path, columns=None):
    """Read one headerless MNIST csv file into a DataFrame."""
    return pd.read_csv(path, header=None, names=columns)

trainData = _read_mnist_csv('./dataset/MNIST/train_data.csv')
trainLabels = _read_mnist_csv('./dataset/MNIST/train_label.csv', columns=['label'])
testData = _read_mnist_csv('./dataset/MNIST/test_data.csv')
testLabels = _read_mnist_csv('./dataset/MNIST/test_label.csv', columns=['label'])
def visualize(digit):
    """Render a flattened 784-pixel digit as a 28x28 grayscale image.

    `digit` is any sequence of 784 numbers (e.g. one DataFrame row's values).
    Displays the image with matplotlib; returns nothing.
    """
    pixels = np.asarray(digit, dtype='float').reshape(28, 28)
    plt.imshow(pixels, cmap='gray')
    plt.show()
# Sanity check: display the first training example whose label is 9.
ninesMask = trainLabels['label'] == 9
firstNine = trainData.loc[ninesMask].iloc[0]
visualize(firstNine)
1.
K-Nearest Neighbors is a classification algorithm in Machine Learning which belongs to the supervised learning domain.
The KNN algorithm is based on the assumption that similar data points are close to each other.
Given an unclassified point, we can assign it to a group by observing which group its nearest neighbors belong to; therefore, a distance function appropriate to the problem should be used to measure distances between points in the feature space and find the nearest neighbors.
2.
from sklearn.neighbors import KNeighborsClassifier

# Quick sweep over candidate k values to see how neighborhood size
# affects accuracy on the full test set.
for candidateK in range(1, 7):
    candidateClf = KNeighborsClassifier(n_neighbors=candidateK)
    candidateClf.fit(trainData.values, trainLabels.values.ravel())
    accuracy = candidateClf.score(testData.values, testLabels.values.ravel())
    print("k =", candidateK, " score =", accuracy)
Using the score function we can observe that as k increases from 1 to 4 the score increases, and beyond that the score decreases as k grows further.
So we will use k = 4.
# Train the final classifier with the best k found in the sweep above.
k = 4
neighClf = KNeighborsClassifier(n_neighbors=k)
neighClf.fit(trainData.values, trainLabels.values.ravel())

# Measure accuracy by hand on the first testNum test examples.
testNum = 200
pred = neighClf.predict(testData.iloc[0:testNum].values)
correctPred = sum(
    1 for idx, guess in enumerate(pred) if guess == testLabels.values[idx]
)
print("Accuracy: ", correctPred / testNum)

# Show a handful of test digits alongside their predicted labels.
for idx, digit in enumerate(testData.iloc[0:5].values):
    visualize(digit)
    print("Predicted digit:", pred[idx])
3.
# Compare train vs test accuracy across the same range of k values.
n1 = 150  # number of test rows scored
n2 = 500  # number of train rows scored
kRange = range(1, 7)
trainAcc, testAcc = [], []
for neighbors in kRange:
    sweepClf = KNeighborsClassifier(n_neighbors=neighbors)
    sweepClf.fit(trainData.values, trainLabels.values.ravel())
    trainAcc.append(sweepClf.score(trainData.iloc[0:n2].values,
                                   trainLabels.iloc[0:n2].values.ravel()))
    testAcc.append(sweepClf.score(testData.iloc[0:n1].values,
                                  testLabels.iloc[0:n1].values.ravel()))

plt.plot(kRange, trainAcc)
plt.plot(kRange, testAcc)
plt.legend(['Accuracy on train data', 'Accuracy on test data'], loc='lower right')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')
plt.title('Effect of n_neighbors on accuracy of k-nearest neighbors algorithm')
plt.show()
Accuracy on test data is maximum when k = 4
4.
Accuracy on train data is, unsurprisingly, maximal at k = 1: each training example is its own nearest neighbor, so it always receives its own label. As k increases, train accuracy decreases (and loss increases), because each example is matched against additional neighboring samples that are not identical to it.
5.
# Inspect which training digits the model considers nearest to one test example.
testExampleIndex = 2
print("Test example:")
visualize(testData.iloc[testExampleIndex].values)
distances, neighborIndices = neighClf.kneighbors([testData.iloc[testExampleIndex]])
print("Neighbors: ")
for neighborIdx in neighborIndices[0]:
    visualize(trainData.iloc[neighborIdx].values)
6.
7.
Decision trees are a non-parametric supervised learning method used for classification.
A decision tree is a flowchart-like tree structure, where each internal node denotes a test on an attribute, each branch represents an outcome of that test, and each leaf node holds a class label.
8.
from sklearn.tree import DecisionTreeClassifier

# Decision-tree classifier on the same MNIST data; hyperparameters were
# picked empirically (see the max_depth sweep below).
treeClf = DecisionTreeClassifier(max_depth=13, random_state=0,
                                 min_samples_leaf=1, max_leaf_nodes=170,
                                 min_samples_split=4, max_features=200)
treeClf.fit(trainData.values, trainLabels.values.ravel())

# Score the first numEvaluated test rows by hand.
# NOTE(review): the original sliced 200 rows with a literal but divided by a
# `testNum` variable defined in a much earlier cell — the two magic numbers
# are unified into one local constant here so the cell stands on its own.
numEvaluated = 200
pred = treeClf.predict(testData.iloc[0:numEvaluated].values)
correctPred = 0
for index, predicted in enumerate(pred):
    if predicted == testLabels.values[index]:
        correctPred += 1
print("Accuracy:", correctPred / numEvaluated, "\n\n")

print("Some of test examples")
print("_" * 50)
for index, digit in enumerate(testData.iloc[0:5].values):
    print("Test example:")
    visualize(digit)
    print("Predicted digit:", pred[index])
    print("_" * 50)
9.
# Sweep max_depth and plot train/test accuracy for the decision tree,
# keeping all other hyperparameters fixed.
maxDepthRange = range(6, 20)
trainAccTree = []
testAccTree = []
for depth in maxDepthRange:
    depthClf = DecisionTreeClassifier(max_depth=depth, random_state=0,
                                      min_samples_leaf=1, max_leaf_nodes=170,
                                      min_samples_split=4, max_features=200)
    depthClf.fit(trainData.values, trainLabels.values.ravel())
    trainAccTree.append(depthClf.score(trainData.values, trainLabels.values.ravel()))
    testAccTree.append(depthClf.score(testData.values, testLabels.values.ravel()))

plt.plot(maxDepthRange, trainAccTree)
plt.plot(maxDepthRange, testAccTree)
plt.legend(['Accuracy on train data', 'Accuracy on test data'], loc='lower right')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.title('Effect of max_depth on decision tree algorithm')
plt.show()
The optimum accuracy on test data occurs when max_depth = 12
10.
# Render the trained decision tree as an inline PNG via graphviz/pydotplus.
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

treeDot = export_graphviz(treeClf, out_file=None, filled=True,
                          rounded=True, special_characters=True)
Image(pydotplus.graph_from_dot_data(treeDot).create_png())